# Load the cleaned EGM article dataset (1019 articles x 16 columns).
dat <- read.csv(here("data", "20201215_EGM_Net_all-articles_clean.csv"))
dim(dat) #1019 16
## [1] 1019 16
#names(dat)
#hist(dat$year)
# Initial exploratory checks (kept for reference, not run):
# length(unique(dat$Title)) #1019 - every title is unique
# title <- unique(dat$Title)
# length(unique(dat$DOI)) #only 184 DOI values and the missing value is stored as ""! (change to NA later?)
# table(dat$Item_Type) #mostly articles
# hist(dat$Pub_Year, breaks = 70)
# table(dat$Manual_Tags) #stored as vectors of characters, tags separated by "; "
# #table(dat$Item_Type)["journalArticle"] # 959 journal articles
DO NOT RUN - the output was saved to a file at the end of this chunk; that saved output is loaded at the start of the next chunk, to avoid a long wait during the LENS API query.
#subset of the first 50 publications (used for testing)
#dat50 <- dat[1:50, ] # use instead of "dat" in the code below
#Note: if more than 50 requests are made per minute, the API responds with "code: 1 rate_limited Too many requests. Allowed '50' per minute. 429"; the LENS API call loop was changed to accommodate this by adding waiting time between iterations (works more slowly, but returns all available records)
# Custom function for accessing the Lens.org Scholarly API.
# token: API access token string (sent as the Authorization header).
# query: request body as a JSON string.
# Returns the raw httr response object; decode with httr::content() downstream.
getLENSData <- function(token, query){
  url <- 'https://api.lens.org/scholarly/search'
  headers <- c('Authorization' = token, 'Content-Type' = 'application/json')
  # Namespace add_headers explicitly (POST already is) so the function
  # works even when httr is not attached with library().
  httr::POST(url = url, httr::add_headers(.headers = headers), body = query)
}
#prepare request ------------------------------------------------------
max_results <- 500 #limit on number of records per request; the current API maximum is 500
article_list <- dat$Title #use dat50$Title instead for testing
article_title <- article_list[1] #extract the first title from the list
readRenviron("~/.Renviron")
token <- Sys.getenv("LENS_TOKEN") #access token saved in the local environment
# Build a match_phrase query for one title; "scroll": "1m" keeps the result
# window open for a minute. sprintf() replaces the previous nested paste0()
# chain and produces a byte-identical request string.
# NOTE(review): titles containing double quotes would break this JSON --
# consider jsonlite::toJSON() for proper escaping.
request <- sprintf(
  '{"query": {"match_phrase": {"title": "%s"}}, "size": "%s","scroll": "1m"}',
  article_title, max_results
) #add "include": ["lens_id", "authors", "publication_type", "title"] to restrict to certain fields
#initial query with the first title only
data <- getLENSData(token, request)
record_json <- content(data, "text")
record_list <- jsonlite::fromJSON(record_json) #convert json output from article search to list
#names(record_list)
#str(record_list$data)
print("iteration = 1")
record_df <- data.frame(record_list) #convert it into a data frame
dim(record_df)
#names(record_df)
#record_df$data.title
#record_df$data.authors
#total <- record_list[["total"]] #1
#use a loop to run the query for the remaining titles and append the records one by one.
# seq_along()[-1] is safe if article_list ever has fewer than 2 elements
# (2:length(x) would count backwards and re-query title 1).
for (i in seq_along(article_list)[-1]) {
  article_title <- article_list[i] #extract the title from the list
  request <- sprintf(
    '{"query": {"match_phrase": {"title": "%s"}}, "size": "%s","scroll": "1m"}',
    article_title, max_results
  ) #add "include": [...] to restrict to certain fields
  data <- getLENSData(token, request)
  print(paste("iteration = ", i))
  record_json <- content(data, "text")
  record_list <- jsonlite::fromJSON(record_json) # convert json output from article search to list
  # Plain if/else: ifelse() is vectorized and not intended for scalar control flow.
  if (record_list$total == 0) {
    print(paste0("record not found for: ", article_title))
  } else {
    new_df <- data.frame(record_list)
    record_df <- dplyr::bind_rows(record_df, new_df) # bind the latest search data frame to the previous data frame
  }
  Sys.sleep(2.5) #slow down to not exceed 50 API calls per minute
}
# Inspect the combined LENS results and persist them to disk.
dim(record_df) #1073 out of 1019 - some titles matched multiple records
names(record_df)
length(unique(record_df$data.title)) #1073 out of 1019 - some titles matched multiple records, 45 not found
#View(cbind(article_list, record_df$data.title)) #shifted positions!
class(record_df) #data frame, but it has nested lists, etc.
save(record_df, file = here("data", "LENS_dataframe.RData")) #save LENS output as a Rdata object
Information about LENS output values: https://docs.api.lens.org/response-scholar.html
#start by loading the saved output from LENS (avoids re-querying the API)
load("./data/LENS_dataframe.RData") #loads record_df data object
names(record_df)
## [1] "scroll_id" "total"
## [3] "data.lens_id" "data.title"
## [5] "data.publication_type" "data.year_published"
## [7] "data.date_published_parts" "data.created"
## [9] "data.external_ids" "data.authors"
## [11] "data.source" "data.fields_of_study"
## [13] "data.volume" "data.issue"
## [15] "data.languages" "data.references"
## [17] "data.source_urls" "data.abstract"
## [19] "data.references_count" "data.scholarly_citations_count"
## [21] "data.start_page" "data.end_page"
## [23] "data.scholarly_citations" "data.author_count"
## [25] "results" "data.date_published"
## [27] "data.keywords" "data.publication_supplementary_type"
## [29] "data.mesh_terms" "data.chemicals"
## [31] "data.open_access" "data.funding"
## [33] "data.is_open_access" "data.conference"
dim(record_df)
## [1] 1073 34
length(unique(record_df$data.title)) #974 unique titles
## [1] 974
#View(record_df[duplicated(record_df$data.title) | duplicated(record_df$data.title, fromLast=TRUE), ]) #visual check - some records have more info than others
# Number of records without fields_of_study (also likely to have other missing
# data). vapply() is type-stable, unlike sapply().
sum(vapply(record_df$data.fields_of_study, is.null, logical(1)))
## [1] 129
record_df$data.has_fields_of_study <- vapply(record_df$data.fields_of_study, is.null, logical(1))
# Sort so records WITH fields_of_study come first within each title, then
# keep only the first (most complete) record per title.
record_df_unique <- record_df %>%
  arrange(data.has_fields_of_study, data.title) %>%
  distinct(data.title, .keep_all = TRUE)
dim(record_df_unique) #check dimensions (this call had been fused into a comment and never ran)
record_df <- record_df_unique #reassign
dim(record_df) #974
## [1] 974 35
# Quick descriptive plots of the deduplicated LENS records.
par(mar=c(4,4,2,2))
hist(record_df$data.year_published, main = "Publication year")
par(mar=c(4,15,2,2)) # wide left margin for horizontal bar labels
barplot(sort(table(record_df$data.publication_type)), horiz=TRUE, las=1, xlab = "Count", main = "Publication type")
par(mar=c(4,18,2,2))
barplot(sort(sort(table(record_df$data.source$title),decreasing = TRUE)[1:10]), horiz=TRUE, las=1, xlab = "Count", main = "Top10 publication sources (journals)")
par(mar=c(4,10,2,2))
barplot(sort(sort(table(unlist(record_df$data.fields_of_study)),decreasing = TRUE)[1:10]), horiz=TRUE, las=1, xlab = "Count", main = "Top10 fields of study")
#record_df$data.fields_of_study[[1]][1] #first value from the list for the first paper
#record_df$data.fields_of_study[1] #list of values for the first paper
par(mar=c(4,5,2,2))
barplot(table(!is.na(record_df$data.scholarly_citations_count)), horiz=TRUE, las=1, xlab = "Count", main = "Has count of citations?") #NA should probably be 0
hist(record_df$data.scholarly_citations_count, main = "Number of citations", breaks=100) #ignoring NA
# length(unlist(record_df$data.scholarly_citations)) #45097 total citation LENS ids
par(mar=c(4,5,2,2))
barplot(table(!is.na(record_df$data.references_count)), horiz=TRUE, las=1, xlab = "Count", main = "Has references?")
# length(unlist(record_df$data.references)) #30659 total reference LENS ids
# Extract the DOI for each article from the nested external_ids data frames:
# vlookup returns the 'value' where 'type' == "doi" (NA when absent).
doi <- unlist(lapply(record_df$data.external_ids, function(ch) expss::vlookup('doi', ch, result_column = 'value', lookup_column = 'type'))) #extracting doi for each article
par(mar=c(4,5,2,2))
barplot(table(!is.na(doi)), horiz=TRUE, las=1, xlab = "Count", main = "Has doi?")
Extract authors data.
#unnest the first level of lists for data.authors: one row per (publication, author)
record_df_data.authors <- record_df %>% select(data.lens_id, data.title, data.publication_type, data.year_published, data.authors) %>% unnest(data.authors)
#str(record_df_data.authors) #a tibble with some lists of data frames for ids and affiliations
names(record_df_data.authors)
## [1] "data.lens_id" "data.title" "data.publication_type"
## [4] "data.year_published" "first_name" "last_name"
## [7] "initials" "ids" "affiliations"
#unnest author ids from record_df_data.authors: one row per (publication, author, id)
record_df_data.authors.ids <- record_df_data.authors %>% select(data.lens_id, data.title, data.publication_type, data.year_published, first_name, last_name, initials, ids) %>% unnest(ids)
dim(record_df_data.authors.ids) #4052
## [1] 4052 9
#names(record_df_data.authors.ids) #a tibble with unnested ids; use for igraph: data.lens_id value
record_df_data.authors.ids$Author <- paste(record_df_data.authors.ids$last_name, record_df_data.authors.ids$initials, sep=", ") #add a new column with author name made of last_name and initials
## check overlaps and inconsistencies in author ids:
#record_df_data.authors.ids %>% count(Author, type, value, sort = FALSE) %>% View # some authors appear multiple times when multiple ids types are available for them (e.g. Aavik, T, Park, KJ, Anderson, SH) - selectively remove orcid (as less common).
#record_df_data.authors.ids %>% count(type, Author, value, sort = FALSE) %>% View # some authors with multiple ids of the same type (e.g. Bigler, F, Brandle, JR, Bright, JA)
#record_df_data.authors.ids %>% count(value, type, Author, sort = FALSE) %>% View # some "multiple" authors per id
#View(arrange(record_df_data.authors.ids[duplicated(record_df_data.authors.ids$value) | duplicated(record_df_data.authors.ids$value, fromLast=TRUE), ], value)) #visual check - some records have more info than others
# Rows whose id value occurs more than once anywhere in the table, sorted by value.
value_check_df <- arrange(record_df_data.authors.ids[duplicated(record_df_data.authors.ids$value) | duplicated(record_df_data.authors.ids$value, fromLast=TRUE), ], value)
value_check_df$value_Author <- paste(value_check_df$type, value_check_df$value, value_check_df$Author, sep="-")
#View(duplicated(distinct(value_check_df, value_Author, .keep_all = TRUE)), ) #654 rows to be checked!
# Keep one row per type-value-Author combination among ids shared by more than
# one row (standard `<-` assignment instead of the right-arrow pipeline).
value_check_df2 <- value_check_df %>%
  group_by(value) %>%
  filter(n() > 1) %>%
  distinct(value_Author, .keep_all = TRUE)
names(value_check_df2)
## [1] "data.lens_id" "data.title" "data.publication_type"
## [4] "data.year_published" "first_name" "last_name"
## [7] "initials" "type" "value"
## [10] "Author" "value_Author"
check_values <- value_check_df2$value[duplicated(value_check_df2$value)] #id values (orcid/magid) to check
##substitute values with correct author first name and initials.
##Each numbered fix below resolves one id flagged in check_values. The fixes
##are order-sensitive within each number: Author/value columns are used in the
##match conditions and are themselves rewritten, so the rewriting line must
##come after the lines that match on the old value (see #5 especially).
#1
#orcid 0000-0001-5002-106X "Reberg-Horton, SC" to Reberg-Horton, C
#record_df_data.authors.ids[record_df_data.authors.ids$value == check_values[1], c("value", "Author", "first_name", "initials")]
#View(record_df_data.authors.ids[record_df_data.authors.ids$last_name == "Reberg-Horton", ]) #one orcid, 3 different magid! - remove all rows with magid, make first name Chris and initials C
record_df_data.authors.ids <- subset(record_df_data.authors.ids, last_name != "Reberg-Horton" | type != "magid")
dim(record_df_data.authors.ids) #remove all records with magid
## [1] 4047 10
record_df_data.authors.ids$first_name[record_df_data.authors.ids$value == "0000-0001-5002-106X" & record_df_data.authors.ids$last_name == "Reberg-Horton"] <- "Chris"
record_df_data.authors.ids$initials[record_df_data.authors.ids$value == "0000-0001-5002-106X" & record_df_data.authors.ids$Author == "Reberg-Horton, SC"] <- "C"
record_df_data.authors.ids$Author[record_df_data.authors.ids$value == "0000-0001-5002-106X" & record_df_data.authors.ids$Author == "Reberg-Horton, SC"] <- "Reberg-Horton, C"
#2
#orcid 0000-0001-5069-0204 "Baker, M" to "Baker, ME" :
#record_df_data.authors.ids[record_df_data.authors.ids$value == check_values[2], c("value", "Author", "first_name", "initials")]
#record_df_data.authors.ids[record_df_data.authors.ids$value == "0000-0001-5069-0204", ]
record_df_data.authors.ids$first_name[record_df_data.authors.ids$value == "0000-0001-5069-0204" & record_df_data.authors.ids$Author == "Baker, M"] <- "Matthew E."
record_df_data.authors.ids$initials[record_df_data.authors.ids$value == "0000-0001-5069-0204" & record_df_data.authors.ids$Author == "Baker, M"] <- "ME"
record_df_data.authors.ids$Author[record_df_data.authors.ids$value == "0000-0001-5069-0204" & record_df_data.authors.ids$Author == "Baker, M"] <- "Baker, ME"
#3
#orcid 0000-0001-6431-9959 "Pywell, R" to "Pywell, RF"
#record_df_data.authors.ids[record_df_data.authors.ids$value == check_values[3], c("value", "Author", "first_name", "initials")]
record_df_data.authors.ids$first_name[record_df_data.authors.ids$value == "0000-0001-6431-9959" & record_df_data.authors.ids$Author == "Pywell, R"] <- "Richard F."
record_df_data.authors.ids$initials[record_df_data.authors.ids$value == "0000-0001-6431-9959" & record_df_data.authors.ids$Author == "Pywell, R"] <- "RF"
record_df_data.authors.ids$Author[record_df_data.authors.ids$value == "0000-0001-6431-9959" & record_df_data.authors.ids$Author == "Pywell, R"] <- "Pywell, RF"
#4
#orcid 0000-0001-9558-0586 "Marshall, E" to "Marshall, EJP"
#record_df_data.authors.ids[record_df_data.authors.ids$value == check_values[4], c("value", "Author", "first_name", "initials")]
record_df_data.authors.ids$first_name[record_df_data.authors.ids$value == "0000-0001-9558-0586" & record_df_data.authors.ids$Author == "Marshall, E"] <- "E. J. P."
record_df_data.authors.ids$initials[record_df_data.authors.ids$value == "0000-0001-9558-0586" & record_df_data.authors.ids$Author == "Marshall, E"] <- "EJP"
record_df_data.authors.ids$Author[record_df_data.authors.ids$value == "0000-0001-9558-0586" & record_df_data.authors.ids$Author == "Marshall, E"] <- "Marshall, EJP"
#5
#orcid 0000-0002-1800-4558 - Alain Butet- fix orcid 0000-0002-9173-3466
#orcid 0000-0002-1800-4558 - Agnes Fargue-Lelievre - fix orcid 0000-0002-0426-8931 and first name to Agnes
#record_df_data.authors.ids[record_df_data.authors.ids$value == check_values[5], c("value", "Author", "first_name", "initials")]
#ORDER MATTERS here: the first_name fix must run while Fargue-Lelievre still
#carries the old shared orcid, i.e. after Butet's value is rewritten but
#before the final value substitution below.
record_df_data.authors.ids$value[record_df_data.authors.ids$value == "0000-0002-1800-4558" & record_df_data.authors.ids$Author == "Butet, A"] <- "0000-0002-9173-3466"
record_df_data.authors.ids$first_name[record_df_data.authors.ids$value == "0000-0002-1800-4558" & record_df_data.authors.ids$first_name == "A."] <- "Agnes"
record_df_data.authors.ids$value[record_df_data.authors.ids$value == "0000-0002-1800-4558" & record_df_data.authors.ids$Author == "Fargue-Lelièvre, A"] <- "0000-0002-0426-8931"
#6
#orcid 0000-0002-4202-2043 fix "NORRDAHL, K" "K" to "Norrdahl, K" "Kai"
#record_df_data.authors.ids[record_df_data.authors.ids$value == check_values[6], c("value", "Author", "first_name", "initials", "last_name")]
record_df_data.authors.ids$first_name[record_df_data.authors.ids$value == "0000-0002-4202-2043" & record_df_data.authors.ids$Author == "NORRDAHL, K"] <- "Kai"
record_df_data.authors.ids$last_name[record_df_data.authors.ids$value == "0000-0002-4202-2043" & record_df_data.authors.ids$Author == "NORRDAHL, K"] <- "Norrdahl"
record_df_data.authors.ids$Author[record_df_data.authors.ids$value == "0000-0002-4202-2043" & record_df_data.authors.ids$Author == "NORRDAHL, K"] <- "Norrdahl, K"
#7
#orcid 0000-0003-0300-9951 "Woodcock, B" "B.A." B" to "Woodcock, BA" "Ben A."
#record_df_data.authors.ids[record_df_data.authors.ids$value == check_values[7], c("value", "Author", "first_name", "initials")]
record_df_data.authors.ids$first_name[record_df_data.authors.ids$value == "0000-0003-0300-9951" & record_df_data.authors.ids$Author == "Woodcock, B"] <- "Ben A."
record_df_data.authors.ids$initials[record_df_data.authors.ids$value == "0000-0003-0300-9951" & record_df_data.authors.ids$Author == "Woodcock, B"] <- "BA"
record_df_data.authors.ids$Author[record_df_data.authors.ids$value == "0000-0003-0300-9951" & record_df_data.authors.ids$Author == "Woodcock, B"] <- "Woodcock, BA"
#8
#orcid 0000-0003-1416-6047 "Bourn, NAD" to "Bourn, NA"
#record_df_data.authors.ids[record_df_data.authors.ids$value == check_values[8], c("value", "Author", "first_name", "initials")]
record_df_data.authors.ids$first_name[record_df_data.authors.ids$value == "0000-0003-1416-6047" & record_df_data.authors.ids$Author == "Bourn, NAD"] <- "Nigel A.D."
record_df_data.authors.ids$initials[record_df_data.authors.ids$value == "0000-0003-1416-6047" & record_df_data.authors.ids$Author == "Bourn, NAD"] <- "NA" #literal string "NA" (the initials), not a missing value
record_df_data.authors.ids$Author[record_df_data.authors.ids$value == "0000-0003-1416-6047" & record_df_data.authors.ids$Author == "Bourn, NAD"] <- "Bourn, NA"
#9
#orcid 0000-0003-3616-5563 "FINN, J" to "Finn, JA"
#record_df_data.authors.ids[record_df_data.authors.ids$value == check_values[9], c("value", "Author", "first_name", "initials")]
record_df_data.authors.ids$first_name[record_df_data.authors.ids$value == "0000-0003-3616-5563" & record_df_data.authors.ids$Author == "FINN, J"] <- "John A."
record_df_data.authors.ids$initials[record_df_data.authors.ids$value == "0000-0003-3616-5563" & record_df_data.authors.ids$Author == "FINN, J"] <- "JA"
record_df_data.authors.ids$last_name[record_df_data.authors.ids$value == "0000-0003-3616-5563" & record_df_data.authors.ids$Author == "FINN, J"] <- "Finn"
record_df_data.authors.ids$Author[record_df_data.authors.ids$value == "0000-0003-3616-5563" & record_df_data.authors.ids$Author == "FINN, J"] <- "Finn, JA"
#10
#orcid 0000-0003-3742-7035 "Holland, J" "J.M." to "Holland, JM" "John M."
#record_df_data.authors.ids[record_df_data.authors.ids$value == check_values[10], c("value", "Author", "first_name", "initials")]
record_df_data.authors.ids$first_name[record_df_data.authors.ids$value == "0000-0003-3742-7035" & record_df_data.authors.ids$Author == "Holland, J"] <- "John M."
record_df_data.authors.ids$initials[record_df_data.authors.ids$value == "0000-0003-3742-7035" & record_df_data.authors.ids$Author == "Holland, J"] <- "JM"
record_df_data.authors.ids$Author[record_df_data.authors.ids$value == "0000-0003-3742-7035" & record_df_data.authors.ids$Author == "Holland, J"] <- "Holland, JM"
#11
#orcid 0000-0003-4225-9451 "Rahman, M" to "Rahman, MM"
#record_df_data.authors.ids[record_df_data.authors.ids$value == check_values[11], c("value", "Author", "first_name", "initials")]
record_df_data.authors.ids$first_name[record_df_data.authors.ids$value == "0000-0003-4225-9451" & record_df_data.authors.ids$Author == "Rahman, M"] <- "Mizanur Md."
record_df_data.authors.ids$initials[record_df_data.authors.ids$value == "0000-0003-4225-9451" & record_df_data.authors.ids$Author == "Rahman, M"] <- "MM"
record_df_data.authors.ids$Author[record_df_data.authors.ids$value == "0000-0003-4225-9451" & record_df_data.authors.ids$Author == "Rahman, M"] <- "Rahman, MM"
#12
#orcid 0000-0003-4382-7051 "Sparks, T" "T.H" to "Sparks, TH" "Tim H."
#record_df_data.authors.ids[record_df_data.authors.ids$value == check_values[12], c("value", "Author", "first_name", "initials")]
record_df_data.authors.ids$first_name[record_df_data.authors.ids$value == "0000-0003-4382-7051" & record_df_data.authors.ids$Author == "Sparks, T"] <- "Tim H."
record_df_data.authors.ids$initials[record_df_data.authors.ids$value == "0000-0003-4382-7051" & record_df_data.authors.ids$Author == "Sparks, T"] <- "TH"
record_df_data.authors.ids$Author[record_df_data.authors.ids$value == "0000-0003-4382-7051" & record_df_data.authors.ids$Author == "Sparks, T"] <- "Sparks, TH"
# Some of these will need to be manually resolved - e.g. checking if these are really different people
# NOTE(review): this subset() repeats the Reberg-Horton/magid removal done in
# fix #1 above, so it is a no-op here (dimensions stay 4047 x 10).
record_df_data.authors.ids <- subset(record_df_data.authors.ids, last_name != "Reberg-Horton" | type != "magid")
dim(record_df_data.authors.ids) #remove all records with magid
## [1] 4047 10
#check ids for missing values and id-type frequencies
table(is.na(record_df_data.authors.ids$type)) #no missing values
##
## FALSE
## 4047
table(is.na(record_df_data.authors.ids$value)) #no missing values
##
## FALSE
## 4047
table(record_df_data.authors.ids$type)
##
## magid orcid
## 3468 579
#clean by removing orcid when a magid is available for the same author on the
#same paper. filter(type == min(type)) relies on "magid" < "orcid" in
#alphabetical order.
record_df_data.authors.ids <- record_df_data.authors.ids %>%
  group_by(data.title, Author) %>%
  filter(type == min(type)) %>%
  ungroup() # drop the grouping so later operations start from an ungrouped tibble
table(record_df_data.authors.ids$type) #3468 - magid, 11 - orcid (see output below)
##
## magid orcid
## 3468 11
#check how many authors have the same name but different id values
check_next <- record_df_data.authors.ids %>%
  group_by(Author) %>%
  summarise(count = n_distinct(value)) %>%
  filter(count > 1)
dim(check_next) #134 Authors with multiple id values - further checking needed, e.g.:
## [1] 134 2
#record_df_data.authors.ids[record_df_data.authors.ids$Author == "Anderson, SH",] #same as below
#record_df_data.authors.ids[record_df_data.authors.ids$Author == as.character(check_next[1,"Author"]), ]
####### finished cleaning ids data frame for now
#unnest author affiliations from record_df_data.authors: one row per (publication, author, affiliation)
record_df_data.authors.aff <- record_df_data.authors %>% select(data.lens_id, data.title, data.publication_type, data.year_published, first_name, last_name, initials, affiliations) %>% unnest(affiliations)
dim(record_df_data.authors.aff) #2968 - some articles have missing data, esp country ID
## [1] 2968 10
#names(record_df_data.authors.aff) #a tibble with unnested affiliations
table(is.na(record_df_data.authors.aff$country_code)) #missing country ID - may need to impute - try https://www.grid.ac/ Global Research Identifier Database (GRID) with name (affiliation name)
##
## FALSE TRUE
## 2083 885
####### do affiliation cleaning later
Processing summary:
- Author data: partially cleaned; a bit more remains to do.
- Affiliation data: not bad, but needs cleaning and imputation of missing values.
Note: this needs to be redone after cleaning and imputing the affiliation data.
## Build the affiliation co-occurrence network --------------------------
# Combined unique identifier: country code + institution GRID id.
record_df_data.authors.aff$country_grid.id <- paste(
  record_df_data.authors.aff$country_code,
  record_df_data.authors.aff$grid_id,
  sep = ", "
)
# Author label assembled from last name and initials.
record_df_data.authors.aff$Author <- paste(
  record_df_data.authors.aff$last_name,
  record_df_data.authors.aff$initials,
  sep = ", "
)
# Edge-list input for igraph: one row per (publication, author, affiliation).
dti <- data.frame(
  pub.id = record_df_data.authors.aff$data.lens_id,
  Author = record_df_data.authors.aff$Author,
  value  = record_df_data.authors.aff$country_grid.id
)
str(dti)
## 'data.frame': 2968 obs. of 3 variables:
## $ pub.id: chr "091-265-753-392-171" "091-265-753-392-171" "091-265-753-392-171" "091-265-753-392-171" ...
## $ Author: chr "MacLeod, A" "Wratten, SD" "Wratten, SD" "Sotherton, NW" ...
## $ value : chr "GB, grid.5491.9" "GB, grid.5491.9" "NA, NA" "NA, NA" ...
# Self-join on publication id pairs up co-authors of the same paper;
# Author.x < Author.y keeps each unordered pair once; count() tallies how
# often each pair of affiliations co-occurs; the tallies become the edge list.
affiliation_pairs <- dti %>%
  inner_join(dti, by = "pub.id") %>%
  filter(Author.x < Author.y) %>%
  count(value.x, value.y)
g1i <- graph_from_data_frame(affiliation_pairs, directed = FALSE)
#as_data_frame(g1i, what = "edges")
# plot(g1i)
# plot(g1i, edge.arrow.size=0, vertex.color="gold", vertex.size=5,
# vertex.frame.color="gray", vertex.label.color="black",
# vertex.label.cex=0.8, vertex.label.dist=2, edge.curved=0.2)
#E(g1i)$weight <- 1 #add weights
# NOTE(review): the graph's edges carry the co-occurrence count in attribute
# "n" (from count()), not "weight"; with edge.attr.comb=list(weight="sum",
# "ignore") the "n" counts appear to be discarded by simplify() -- confirm
# whether the counts were meant to be kept (e.g. rename n to weight first).
g1is <- simplify(g1i, remove.multiple = T, remove.loops = T,
edge.attr.comb=list(weight="sum", "ignore") ) #collapse parallel edges and drop self-loops
plot(g1is, layout=layout_nicely, edge.arrow.size=0, vertex.color="gold", vertex.size=5, vertex.frame.color="gray",
vertex.label.color="black", vertex.label.cex=0.8, vertex.label.dist=2, edge.curved=0.2)
#E(g1is)
#V(g1is)
##Find cliques (complete subgraphs of an undirected graph) - exploratory, not run:
# cliques(as.undirected(g1is)) # list of cliques
# sapply(cliques(as.undirected(g1is)), length) # clique sizes
# largest_cliques(as.undirected(g1is)) # clique with max number of nodes
# vcol <- rep("grey80", vcount(as.undirected(g1is)))
# vcol[unlist(largest_cliques(as.undirected(g1is)))] <- "gold"
# plot(as.undirected(g1is), vertex.label=V(g1is)$name, vertex.color=vcol)
##Community detection based on edge betweenness (Newman-Girvan):
##high-betweenness edges are removed sequentially (recalculating at each step)
##and the best partitioning of the network is selected.
cebi <- cluster_edge_betweenness(as.undirected(g1is))
#dendPlot(cebi, mode="hclust") #too dense to be readable
plot(cebi, as.undirected(g1is), layout=layout_nicely, edge.arrow.size=0.0, vertex.color="gold", vertex.size=2,
vertex.frame.color="grey", vertex.label.color="black", vertex.label=NA)
#more at: https://kateto.net/netscix2016.html, pretty_plots.R
#text cleaning for word analysis: https://lukesingham.com/how-to-make-a-word-cloud-using-r/